import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set global font properties to Arial and keep text editable in vector exports.
rcParams.update(
    {
        "font.family": "sans-serif",
        "font.sans-serif": "Arial",
        # 42 embeds fonts as TrueType (Type 42) instead of matplotlib's
        # default Type 3, so text stays editable in PDF/PS output.
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "text.usetex": False,
        # "none" writes SVG text as text elements rather than outline paths.
        "svg.fonttype": "none",
    }
)


def stardize_columns(df):
    """Normalize whitespace in the column labels of *df*, in place.

    Leading and trailing whitespace is removed from every label and each
    internal run of whitespace is collapsed to a single space.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        None. ``df.columns`` is reassigned on the object that was passed in.
    """
    # str.split() with no arguments already discards leading/trailing
    # whitespace, so no separate strip() is needed.
    df.columns = [" ".join(col.split()) for col in df.columns]


# Load the data
df = pd.read_csv("Crimes_One_year_prior_to_present_first_1001.csv")
stardize_columns(df)
Understanding the Dataset
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line to the output.
df.info()

# Show the first few rows
print(df.head(3))

# Basic data cleaning: parse the occurrence timestamps. The values look like
# "01/16/2024 01:00:00 AM" (month/day/year, 12-hour clock), so the format is
# spelled out instead of relying on per-value inference.
df["DATE OF OCCURRENCE"] = pd.to_datetime(
    df["DATE OF OCCURRENCE"], format="%m/%d/%Y %I:%M:%S %p"
)
Understanding the Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CASE# 1000 non-null object
1 DATE OF OCCURRENCE 1000 non-null object
2 BLOCK 1000 non-null object
3 IUCR 1000 non-null object
4 PRIMARY DESCRIPTION 1000 non-null object
5 SECONDARY DESCRIPTION 1000 non-null object
6 LOCATION DESCRIPTION 998 non-null object
7 ARREST 1000 non-null object
8 DOMESTIC 1000 non-null object
9 BEAT 1000 non-null int64
10 WARD 1000 non-null int64
11 FBI CD 1000 non-null object
12 X COORDINATE 999 non-null float64
13 Y COORDINATE 999 non-null float64
14 LATITUDE 999 non-null float64
15 LONGITUDE 999 non-null float64
16 LOCATION 999 non-null object
dtypes: float64(4), int64(2), object(11)
memory usage: 132.9+ KB
None
CASE# DATE OF OCCURRENCE BLOCK IUCR \
0 JH117298 01/16/2024 01:00:00 AM 038XX W DIVERSEY AVE 0810
1 JG561057 12/31/2023 04:30:00 PM 004XX N WABASH AVE 0460
2 JG512939 11/21/2023 02:28:00 PM 056XX S ELIZABETH ST 143A
PRIMARY DESCRIPTION SECONDARY DESCRIPTION \
0 THEFT OVER $500
1 BATTERY SIMPLE
2 WEAPONS VIOLATION UNLAWFUL POSSESSION - HANDGUN
LOCATION DESCRIPTION ARREST DOMESTIC BEAT WARD FBI CD \
0 STREET N N 2524 35 06
1 STREET N N 1834 42 08B
2 RESIDENCE - YARD (FRONT / BACK) N N 713 16 15
X COORDINATE Y COORDINATE LATITUDE LONGITUDE \
0 1150337.0 1918345.0 41.931844 -87.722951
1 1176592.0 1902931.0 41.888994 -87.626935
2 1168951.0 1867382.0 41.791613 -87.656025
LOCATION
0 (41.931843966, -87.722950868)
1 (41.888993854, -87.626934833)
2 (41.791613294, -87.656024853)
# Tag every record with the weekday name of its occurrence timestamp.
occurrence_times = df["DATE OF OCCURRENCE"]
df["DAY_OF_WEEK"] = occurrence_times.dt.day_name()

# One box per weekday, showing the spread of occurrence timestamps.
plt.figure(figsize=(12, 6))
box_ax = sns.boxplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE")
box_ax.set_ylabel("Date")
plt.title("Distribution of Crimes by Day of the Week")
plt.show()
Distribution Plots: Histogram
# Extract the hour of day (0-23) from the occurrence timestamp.
occurrence_times = df["DATE OF OCCURRENCE"]
df["HOUR"] = occurrence_times.dt.hour

# Histogram of incidents per hour with a KDE curve overlaid.
hist_options = {"x": "HOUR", "bins": 24, "kde": True}
plt.figure(figsize=(12, 6))
sns.histplot(data=df, **hist_options)
plt.title("Distribution of Crimes by Hour of the Day")
plt.show()
Distribution Plots: KDE Plot
# One density curve per crime type over the hour of day.
# common_norm=False normalizes each curve independently of the others.
kde_options = {
    "x": "HOUR",
    "hue": "PRIMARY DESCRIPTION",
    "common_norm": False,
}
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, **kde_options)
plt.title("Distribution of Different Crime Types by Hour")
plt.show()
Relational Plots: Scatter Plot
# Plot every incident at its longitude/latitude, colored by crime type.
scatter_options = {
    "x": "LONGITUDE",
    "y": "LATITUDE",
    "hue": "PRIMARY DESCRIPTION",
}
plt.figure(figsize=(12, 10))
sns.scatterplot(data=df, **scatter_options)
plt.title("Geographical Distribution of Crimes")
plt.show()
Relational Plots: Scatter Plot
Relational Plots: Line Plot
# Count incidents per calendar day. "DATE OF OCCURRENCE" holds full
# timestamps, so grouping on the raw column would create one group per
# distinct date-and-time and almost every count would be 1.
occurrence_day = df["DATE OF OCCURRENCE"].dt.floor("D")
crime_counts = df.groupby(occurrence_day).size().reset_index(name="COUNT")

plt.figure(figsize=(12, 6))
sns.lineplot(data=crime_counts, x="DATE OF OCCURRENCE", y="COUNT")
plt.title("Crime Trends Over Time")
plt.xticks(rotation=45)
plt.show()
Advanced Customization
plt.figure(figsize=(14, 8))
sns.set_style("whitegrid")
sns.set_palette("deep")

# Compute the ten most frequent crime types once; the result drives both the
# bar order and the value labels, so the two cannot drift apart.
top_crime_counts = df["PRIMARY DESCRIPTION"].value_counts().head(10)

g = sns.countplot(
    data=df,
    y="PRIMARY DESCRIPTION",
    order=top_crime_counts.index,
)
g.set_title("Top 10 Crime Types", fontsize=20)
g.set_xlabel("Count", fontsize=14)
g.set_ylabel("Crime Type", fontsize=14)

# Write each bar's count just past its right end (3 data units of padding).
for i, v in enumerate(top_crime_counts):
    g.text(v + 3, i, str(v), color="black", va="center")

plt.tight_layout()
plt.show()
Advanced Customization
Heatmap
Useful for visualizing correlation between variables
Can show patterns and relationships in complex datasets
# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df[numeric_cols].corr()

# Build a mask that is True on and above the diagonal. seaborn hides the
# cells where the mask is True, so the redundant upper triangle (and the
# all-ones diagonal) is suppressed and only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(14, 10))

# Draw the heatmap of the lower triangle
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    fmt=".2f",
)
# The title names the triangle that is actually visible.
plt.title("Lower Triangle Correlation Heatmap of Numeric Variables", fontsize=16)
plt.tight_layout()
plt.show()
Customized Heatmap
Pair Plot
Useful for exploring relationships between multiple variables
Creates a grid of scatter plots for each pair of variables
# Select relevant columns for the pair plot
cols_to_plot = ["X COORDINATE", "Y COORDINATE", "LATITUDE", "LONGITUDE"]

# Add hour of day
df["HOUR"] = pd.to_datetime(df["DATE OF OCCURRENCE"]).dt.hour

# Create the pair plot. sns.pairplot is a figure-level function that builds
# its own figure, so no plt.figure() call precedes it; one would only leave
# an empty extra figure behind.
pairplot = sns.pairplot(
    df[cols_to_plot + ["HOUR", "PRIMARY DESCRIPTION"]],
    hue="PRIMARY DESCRIPTION",
    palette="viridis",
    plot_kws={"alpha": 0.6},
    diag_kind="kde",
)

# Title and layout go through the returned grid so they target its figure
# explicitly rather than whichever figure happens to be current.
pairplot.figure.suptitle(
    "Pair Plot of Geographic Variables and Hour of Occurrence", y=1.02
)
pairplot.tight_layout()
plt.show()
Pair Plot
<Figure size 960x960 with 0 Axes>
Regression Plot
Visualizes the relationship between two variables
Includes a linear regression line and confidence interval
# Regress WARD on BEAT in a grid of facets: one column per ARREST value and
# one row per DOMESTIC value, each facet with its own axis limits.
lm_grid = sns.lmplot(
    data=df,
    x="BEAT",
    y="WARD",
    col="ARREST",
    row="DOMESTIC",
    height=3,
    facet_kws=dict(sharex=False, sharey=False),
    scatter_kws={"alpha": 0.5},
)

# Use a figure-level title: plt.title() would only overwrite the title of the
# last facet. The text names the variables that are actually plotted.
lm_grid.figure.suptitle(
    "Regression Plot: Ward vs Beat by Arrest and Domestic Status", y=1.02
)
plt.show()
Regression Plot
Statistical Estimation: Confidence Intervals
Demonstrates how to visualize statistical estimates
Uses bootstrapped confidence intervals
# Count incidents per calendar day, then label each day with its weekday.
# The plot needs several observations per weekday: with a single pre-summed
# row per weekday there is nothing to average or bootstrap, so no confidence
# interval could be drawn. resample() also emits a 0 for days inside the
# observed range that have no records.
daily_counts = df.resample("D", on="DATE OF OCCURRENCE").size()
crime_by_day = daily_counts.reset_index(name="COUNT")
crime_by_day["DAY_OF_WEEK"] = crime_by_day["DATE OF OCCURRENCE"].dt.day_name()

# Show the weekdays in calendar order rather than order of first appearance.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

plt.figure(figsize=(12, 8))
# barplot draws the mean daily count per weekday with a bootstrapped 95% CI.
sns.barplot(
    data=crime_by_day,
    x="DAY_OF_WEEK",
    y="COUNT",
    order=weekday_order,
    errorbar=("ci", 95),
    capsize=0.2,
)
plt.title("Average Crime Count by Day of Week with 95% Confidence Intervals")
plt.xticks(rotation=45)
plt.show()
Statistical Estimation: Confidence Intervals
Advanced Seaborn: FacetGrid
Demonstrates how to create multiple plots in a grid
Useful for comparing distributions across categories
# Create a FacetGrid with one panel per crime type, wrapped three per row.
# FacetGrid builds its own figure sized by height/aspect, so no plt.figure()
# call precedes it; one would only leave an empty extra figure behind.
g = sns.FacetGrid(df, col="PRIMARY DESCRIPTION", col_wrap=3, height=4, aspect=1.5)

# Map a histogram to each subplot
g.map(plt.hist, "HOUR", bins=24)

# Customize the plot
g.set_axis_labels("Hour of Day", "Count")
g.set_titles("{col_name}")
# `figure` is the supported accessor; `fig` is a deprecated alias.
g.figure.suptitle("Distribution of Crimes by Hour for Different Crime Types", y=1.02)
g.tight_layout()
plt.show()